Assignment-4

Turing Machine

  • Nikhil V Revankar - 01FB16ECS230
  • Vinayaka Kamath - 01FB16ECS445
  • Vikram G - 01FB16ECS484
# importing dataset for Question 1 and Question 2
# tips.csv          : complete dataset (used as ground truth when computing RMSE)
# tips_tailored.csv : same data with some Odds values blanked out (NA) for imputation
# NOTE(review): absolute machine-specific paths — adjust before re-running elsewhere
tips_data<-read.csv(file="/Users/vikramg/Desktop/Data_Analytics/Assignment_4/tips.csv", header=TRUE, sep=",")
data<-read.csv(file="/Users/vikramg/Desktop/Data_Analytics/Assignment_4/tips_tailored.csv", header=TRUE, sep=",")

# Root-mean-squared error between values m and o.
# NA pairs are dropped before averaging (na.rm = TRUE), so a partially
# missing input still yields a numeric result.
rmse <- function(m, o) {
  squared_error <- (m - o)^2
  sqrt(mean(squared_error, na.rm = TRUE))
}

Question 1

Two forms of missing data: Missing completely at random (MCAR): this form exists when the missing values are randomly distributed across all observations. Missing at random (MAR): the missing values are not randomly distributed across all observations, but are randomly distributed within one or more sub-samples.

Issue regarding missing data: If the missing values are not handled properly by the researcher, then he/she may end up drawing an inaccurate inference about the data. Due to improper handling, the result obtained by the researcher will differ from ones where the missing values are present.

a)The mean of the column

# a) Fill missing Odds with the overall column mean; keep the raw Odds
# column alongside so the originally-missing rows can be picked out.
overall_mean <- mean(data$Odds, na.rm = TRUE)
data_Odds_mean <- data.frame(
  Odds_mean = ifelse(is.na(data$Odds), overall_mean, data$Odds),
  Odds      = data$Odds
)
# Rows whose Odds were missing in the tailored dataset.
missing_mean <- data_Odds_mean[is.na(data_Odds_mean$Odds), ]
print("RMSE")
## [1] "RMSE"
print(rmse(tips_data$Odds, data_Odds_mean$Odds_mean))
## [1] 0.0779236

b)The median of the column

# b) Fill missing Odds with the column median; keep the raw Odds column
# alongside so the originally-missing rows can be picked out.
overall_median <- median(data$Odds, na.rm = TRUE)
data_Odds_median <- data.frame(
  Odds_median = ifelse(is.na(data$Odds), overall_median, data$Odds),
  Odds        = data$Odds
)
# Rows whose Odds were missing in the tailored dataset.
missing_median <- data_Odds_median[is.na(data_Odds_median$Odds), ]
print("RMSE")
## [1] "RMSE"
print(rmse(tips_data$Odds, data_Odds_median$Odds_median))
## [1] 0.05705324

c)The mean value for that particular horse

library(dplyr)
# c) Fill each missing Odds with the mean Odds of that particular horse.
horse_odds <- subset(data, select = c("Horse", "Odds"))
filled_by_horse <- horse_odds %>%
  group_by(Horse) %>%
  mutate(Odds = ifelse(is.na(Odds), mean(Odds, na.rm = TRUE), Odds)) %>%
  ungroup()
# Attach the raw Odds column so originally-missing rows can be identified.
horse_mean <- setNames(
  data.frame(filled_by_horse, data$Odds),
  c("Horse", "Odds_horse_mean", "Odds")
)
missing_horse_mean <- horse_mean[is.na(horse_mean$Odds), ]
print("RMSE")
## [1] "RMSE"
print(rmse(tips_data$Odds, horse_mean$Odds_horse_mean))
## [1] 0.1551146

d)The interpolation of the data points (mean)

library(zoo)
# d) Fill missing Odds by linear interpolation between neighbouring values.
# fixed: na.approx() drops leading/trailing NAs by default, which would make
# the interpolated vector shorter than data$Odds and crash the data.frame()
# call whenever the first or last Odds happens to be missing. na.rm = FALSE
# keeps those positions as NA, guaranteeing equal lengths; rmse() already
# skips NAs, so the reported result is unchanged for this dataset.
data_Odds_interpolate <- setNames(
  data.frame(na.approx(data$Odds, na.rm = FALSE), data$Odds),
  c("Odds_ip", "Odds")
)
missing_interpolate_Odds <- data_Odds_interpolate[is.na(data_Odds_interpolate$Odds), ]
print("RMSE")
## [1] "RMSE"
print(rmse(tips_data$Odds, data_Odds_interpolate$Odds_ip))
## [1] 0.0709432

e)Imputation

library(mice)

#get a better understanding of the pattern of missing data
# md.pattern() prints one row per missing-data pattern: the left margin counts
# rows sharing that pattern, 1/0 marks observed/missing per variable, and the
# margins total the missing entries (output below: 8 rows are missing Odds).
md.pattern(data)

##       X UID ID Tipster Date Track Horse Bet.Type Result TipsterActive
## 38240 1   1  1       1    1     1     1        1      1             1
## 8     1   1  1       1    1     1     1        1      1             1
##       0   0  0       0    0     0     0        0      0             0
##       Predicted.Results Odds  
## 38240                 1    1 0
## 8                     1    0 1
##                       0    8 8
#visual representation
library(VIM)
# aggr() draws a histogram of the proportion of missing values per variable
# plus the pattern combinations; sortVars=TRUE also prints the per-variable
# missing proportions sorted in decreasing order (shown below).
aggr_plot <- aggr(data, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##           Variable        Count
##               Odds 0.0002091613
##                  X 0.0000000000
##                UID 0.0000000000
##                 ID 0.0000000000
##            Tipster 0.0000000000
##               Date 0.0000000000
##              Track 0.0000000000
##              Horse 0.0000000000
##           Bet.Type 0.0000000000
##             Result 0.0000000000
##      TipsterActive 0.0000000000
##  Predicted.Results 0.0000000000
#imputing missing data
# m = 5 imputed datasets, method = predictive mean matching ("pmm"),
# fixed seed for reproducibility.
# NOTE(review): maxit = 0 performs zero Gibbs-sampling iterations, so the
# values returned by complete() are only mice's initial draws rather than
# converged pmm imputations — confirm this trade-off is acceptable given the
# memory limitation described below.
imputedData <- mice(data,m=5,maxit=0,meth="pmm",seed=500)
complete_data = complete(imputedData)
summary(complete_data)
##        X              UID              ID             Tipster     
##  Min.   :    1   Min.   :    1   Min.   :   1   Tipster X : 4383  
##  1st Qu.: 9563   1st Qu.: 9563   1st Qu.: 318   Tipster E : 3700  
##  Median :19124   Median :19124   Median : 749   Tipster B1: 2497  
##  Mean   :19124   Mean   :19124   Mean   :1013   Tipster A1: 2446  
##  3rd Qu.:28686   3rd Qu.:28686   3rd Qu.:1419   Tipster D1: 2119  
##  Max.   :38248   Max.   :38248   Max.   :4383   Tipster J : 1937  
##                                                 (Other)   :21166  
##          Date                 Track                     Horse      
##  30-07-2016:  110   Kempton      : 2197   Doctor Parkes    :   26  
##  31-10-2015:  106   Wolverhampton: 2113   Chookie Royale   :   23  
##  10-10-2015:  104   Lingfield    : 2058   Oriental Relation:   21  
##  26-12-2015:  104   Ascot        : 1355   Sennockian Star  :   21  
##  09-01-2016:  101   SouthWell    : 1326   Barnet Fair      :   20  
##  06-08-2016:  100   Newmarket    : 1291   Silviniaco Conti :   19  
##  (Other)   :37623   (Other)      :27908   (Other)          :38118  
##      Bet.Type          Odds         Result      TipsterActive  
##  Each Way: 7830   Min.   :  1.07   Lose:30565   Mode :logical  
##  Win     :30417   1st Qu.:  5.00   Win : 7683   FALSE:13062    
##  win     :    1   Median :  8.00                TRUE :25186    
##                   Mean   : 11.00                               
##                   3rd Qu.: 13.00                               
##                   Max.   :407.00                               
##                                                                
##  Predicted.Results
##  Lose:33074       
##  Win : 5174       
##                   
##                   
##                   
##                   
## 

Hence, it is possible to run the imputation on this dataset using the mice package only with the parameter maxit = 0; with maxit > 0 the run fails with the error "Error: cannot allocate vector of size 4.8 Gb". Note, however, that maxit = 0 performs no Gibbs-sampling iterations, so the filled-in values are only mice's initial draws rather than converged pmm imputations.

Line graph

# x-axis positions: one point per originally-missing Odds row.
# generalized: seq_len(nrow(missing_mean)) replaces the hard-coded 1:8, so
# the plot stays correct if the number of missing rows changes.
xval <- setNames(data.frame(seq_len(nrow(missing_mean))), c("X"))
library(plotly)

# Overlay every imputation strategy against the original Odds for the rows
# that were missing, so the closest-fitting method is visible at a glance.
# fixed: the line-colour argument was misspelled "colot" on two traces, so
# those custom colours were silently ignored by plotly.
lineplot <- plot_ly(data, x = xval$X) %>%
  add_lines(y = ~missing_mean$Odds_mean, name = "Odds_mean",
            line = list(color = 'rgb(119, 244, 66)')) %>%
  add_lines(y = ~missing_median$Odds_median, name = "Odds_median",
            line = list(color = 'rgb(229, 168, 36)')) %>%
  add_lines(y = ~missing_horse_mean$Odds_horse_mean, name = "Odds_particular_horse_mean",
            line = list(color = 'rgb(77, 249, 223)')) %>%
  add_lines(y = ~missing_interpolate_Odds$Odds_ip, name = "Interpolated_Odds") %>%
  add_lines(y = ~tips_data[row.names(tips_data) %in% row.names(missing_mean),]$Odds,
            name = "Original_Odds") %>%
  layout(
    title = "Analysis of suitable method for missing values",
    xaxis = list(rangeslider=list(type="date"),title="X"),
    yaxis = list(title = "Odds"))
lineplot

Interpolation of the data points is the most suitable method for filling in the missing values here: the graph of the interpolated Odds follows the original Odds most closely, and its RMSE (0.071) is close to zero (only median imputation, at 0.057, has a slightly lower RMSE, but its graph matches the original values less well).

Question 2

Ans (a)

library(caret)
# Compare the tipsters' predicted results against the actual race results.
# caret::confusionMatrix(prediction, reference); "Lose" is treated as the
# positive class because it is the first factor level (see output below).
confusionMatrix(data$Predicted.Results,data$Result)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Lose   Win
##       Lose 28670  4404
##       Win   1895  3279
##                                          
##                Accuracy : 0.8353         
##                  95% CI : (0.8316, 0.839)
##     No Information Rate : 0.7991         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4156         
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9380         
##             Specificity : 0.4268         
##          Pos Pred Value : 0.8668         
##          Neg Pred Value : 0.6337         
##              Prevalence : 0.7991         
##          Detection Rate : 0.7496         
##    Detection Prevalence : 0.8647         
##       Balanced Accuracy : 0.6824         
##                                          
##        'Positive' Class : Lose           
## 
Accuracy
 (TP + TN)/(TP + TN + FP + FN)
 (28670 + 3279)/(28670 + 3279 + 1895 + 4404)
 = 0.8353

Precision
 TP/(TP + FP)
 28670/(28670 + 4404) 
 = 0.8668

Recall
 TP/(TP + FN)
 28670/(28670 + 1895)
 = 0.9380

 Misclassification/Error Rate
 (FP + FN)/(TP + TN + FP + FN)
 (1895 + 4404)/(28670 + 3279 + 1895 + 4404)
 ERR = 0.164

F1-score
 (2*Recall*Precision)/(Recall + Precision)
 (2 * 0.9380 * 0.8668)/(0.9380 + 0.8668)
 = 1.626/1.8048 = 0.9 

 F score with beta=2
 (1+ Beta^2) * (precision * recall) / ( (Beta^2 * precision) + recall )
 (5 * 0.8668 * 0.938)/((4 * 0.8668)+0.938)
 = 0.923

 F score with beta=0.5
(1.25 * 0.8668 * 0.938)/((0.25 * 0.8668)+0.938)
 = 0.88

Significance of beta

The general formula involves a positive real beta so that F-score measures the effectiveness of retrieval with respect to a user who attaches beta times as much importance to recall as precision.

Ans (b)

# b) Degenerate baseline that always predicts "Lose" (the majority class).
# fixed: a bare recycled "Lose" string becomes a one-level factor, and newer
# versions of caret::confusionMatrix() reject `data` whose levels are not a
# superset-compatible match of `reference`. Building the factor with both
# result levels is robust and leaves the output unchanged (the "Win"
# prediction row is simply all zeros).
Predictions1 <- factor("Lose", levels = levels(factor(data$Result)))
updated_data <- cbind(data, Predictions1)
confusionMatrix(updated_data$Predictions1, updated_data$Result)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Lose   Win
##       Lose 30565  7683
##       Win      0     0
##                                           
##                Accuracy : 0.7991          
##                  95% CI : (0.7951, 0.8031)
##     No Information Rate : 0.7991          
##     P-Value [Acc > NIR] : 0.5031          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7991          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7991          
##          Detection Rate : 0.7991          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Lose            
## 
Accuracy
 (TP + TN)/(TP + TN + FP + FN)
(30565 + 0)/(30565 + 7683 + 0 + 0)
 = 0.7991

Precision
 TP/(TP + FP)
 30565 / (30565 + 7683)
 = 0.7991

 Recall
 TP/(TP + FN)
 30565/(30565 + 0)
 = 1

 Misclassification/Error Rate
 (FP + FN)/(TP + TN + FP + FN)
 (7683 + 0)/(30565 + 7683 + 0 + 0)
 = 0.20

 F-score
 (2*Recall*Precision)/(Recall + Precision)
 = 0.889

What does this indicate?

This model has an F1-score of 0.889, which is less than the F1-score of the previous model (0.90). We can conclude that this model is not a better classifier.

Ans (c)

Accuracy is the most intuitive performance measure and it is simply a ratio of correctly predicted observation to the total observations. One may think that, if we have high accuracy then our model is best. Accuracy works best if false positives and false negatives have similar cost. If we have an uneven class distribution, accuracy is not enough for the evaluation of a classification model.

Recall is the ratio of correctly predicted positive observations to all observations in the actual class. Precision is the ratio of correctly predicted positive observations to the total predicted positive observations. The F1 score is the weighted (harmonic) average of precision and recall, so it takes both false positives and false negatives into account. Since the F1-score is 0.90, which is high, we can conclude that the classifier used to predict the results is better than a classifier with a low F1-score.

Question3

# Question 3: build a train set (2015 + 2016) and a test set (2017)
# restricted to the countries present in all three years, so the model is
# evaluated only on countries it has seen.
data2015<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Data2015.csv",header=TRUE)
data2016<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Data2016.csv",header=TRUE)
data2017<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Data2017.csv",header=TRUE)
# fixed: the intermediate was named `c`, shadowing base::c() for the rest of
# the script — renamed to a descriptive identifier.
common_16_17 <- intersect(data2017$Country, data2016$Country)
country <- intersect(common_16_17, data2015$Country)
train2015 <- subset(data2015, Country %in% country)
train2016 <- subset(data2016, Country %in% country)
train <- rbind(train2015, train2016)
test <- subset(data2017, Country %in% country)

# Root-mean-squared error between predictions m and observations o.
# Note: unlike the Question-1 helper, there is no na.rm here, so any NA in
# the inputs propagates to an NA result.
RMSE <- function(m, o) {
  err <- m - o
  sqrt(mean(err * err))
}

Model-1 training and testing

# Model 1: happiness as a linear function of GDP, Family, year and life
# expectancy, fit on the combined 2015+2016 training data.
model1 <- lm(
  Happiness.Score ~ Economy..GDP.per.Capita. + Family + year + Health..Life.Expectancy.,
  data = train
)
# predict.lm() looks up only the variables named in the model formula, so
# the full test frame can be passed directly — no manual subset() needed.
pred1 <- predict.lm(model1, test)

Model-2 training and testing

# Model 2: same as model 1 but without the Family predictor.
model2 <- lm(
  Happiness.Score ~ Economy..GDP.per.Capita. + year + Health..Life.Expectancy.,
  data = train
)
# predict.lm() looks up only the variables named in the model formula, so
# the full test frame can be passed directly — no manual subset() needed.
pred2 <- predict.lm(model2, test)

RMS Error

# Test-set RMSE for each model; the smaller model (without Family)
# generalizes better on the 2017 data.
RMSE(test$Happiness.Score,pred1)
## [1] 1.182017
RMSE(test$Happiness.Score,pred2)
## [1] 0.602027

Question4

library(caret)
library(InformationValue)
# NOTE(review): InformationValue also exports confusionMatrix(); because it
# is attached after caret, it masks caret's version for the rest of the file.
set.seed(100)
df<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/School_Data.dms",header=TRUE)
# Balance the classes by up-sampling the minority class of Pass.
# NOTE(review): upSample() appends its own `Class` label column while x
# already contains Pass — confirm the duplicated label column is intended.
Train_data<-upSample(x=df,y=factor(df$Pass))
test<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Test - Sheet1.csv",header=TRUE)
# Rename the 9th test column to match the training data ("popularity").
colnames(test)[9]="popularity"

Model-1 training

# Model 1: logistic regression of Pass on all five attendance-day variables
# plus the student attributes (Senior, Class_Prefect, Athlete, popularity).
model1<-glm(Pass ~ Day1+Day2+Day3+Day4+Day5+Senior+Class_Prefect+Athlete+popularity,data=Train_data,family=binomial)
summary(model1)
## 
## Call:
## glm(formula = Pass ~ Day1 + Day2 + Day3 + Day4 + Day5 + Senior + 
##     Class_Prefect + Athlete + popularity, family = binomial, 
##     data = Train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7918  -1.0254   0.1083   1.0326   2.2529  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.10186    0.13747  -0.741  0.45871    
## Day1          -0.71755    0.22138  -3.241  0.00119 ** 
## Day2           0.08185    0.20529   0.399  0.69012    
## Day3          -0.40919    0.21795  -1.877  0.06046 .  
## Day4           1.57659    0.21019   7.501 6.33e-14 ***
## Day5          -0.17310    0.18941  -0.914  0.36077    
## Senior        -0.24305    0.24360  -0.998  0.31840    
## Class_Prefect  0.13014    0.35991   0.362  0.71767    
## Athlete        1.51479    0.61831   2.450  0.01429 *  
## popularity    -3.99837    1.19969  -3.333  0.00086 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 970.41  on 699  degrees of freedom
## Residual deviance: 865.41  on 690  degrees of freedom
## AIC: 885.41
## 
## Number of Fisher Scoring iterations: 4

Looking at the p-values, Day1, Day3, Day4, Athlete and popularity appear to be the most significant features.

Model-2 training

# Model 2: keep only the predictors that were significant in model 1
# (Day1, Day3, Day4, Athlete, popularity).
# fixed: the formula contained a stray "+" ("Day1++Day3"); the unary + is a
# no-op so the fit was unaffected, but it cluttered the printed call. Also
# uses the bare `binomial` family for consistency with model 1 — identical
# behavior to family="binomial".
model2 <- glm(Pass ~ Day1 + Day3 + Day4 + Athlete + popularity,
              data = Train_data, family = binomial)
summary(model2)
## 
## Call:
## glm(formula = Pass ~ Day1 + +Day3 + Day4 + Athlete + popularity, 
##     family = "binomial", data = Train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7551  -1.0452   0.1548   1.0240   2.3637  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.1451     0.1317  -1.102 0.270624    
## Day1         -0.6839     0.2014  -3.395 0.000685 ***
## Day3         -0.4112     0.2064  -1.992 0.046354 *  
## Day4          1.5415     0.2030   7.594 3.11e-14 ***
## Athlete       1.5809     0.6106   2.589 0.009620 ** 
## popularity   -4.1662     1.1099  -3.754 0.000174 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 970.41  on 699  degrees of freedom
## Residual deviance: 867.34  on 694  degrees of freedom
## AIC: 879.34
## 
## Number of Fisher Scoring iterations: 4

Based on the AIC value (879.34 vs 885.41), Model 2 comes out to be better.

Model-1 testing and confusion matrix

# Score the held-out test set with model 1 (predicted pass probabilities).
predict1<-predict(model1,test,type="response")
# InformationValue::optimalCutoff picks the probability threshold that
# minimises misclassification error; [1] keeps just the cutoff value.
optCutOff1<- optimalCutoff(test$Pass, predict1)[1]
# InformationValue::confusionMatrix(actuals, predictedScores, threshold) —
# note the (actual, predicted) argument order, the reverse of caret's.
confusionMatrix(test$Pass, predict1, threshold = optCutOff1)
##   0 1
## 0 6 3
## 1 1 0

Model-2 testing and confusion matrix

# Score the held-out test set with model 2 (predicted pass probabilities).
predict2<-predict(model2,test,type="response")
# Threshold chosen to minimise misclassification error on the test labels.
optCutOff2 <- optimalCutoff(test$Pass, predict2)[1]
# InformationValue::confusionMatrix(actuals, predictedScores, threshold) —
# note the (actual, predicted) argument order, the reverse of caret's.
confusionMatrix(test$Pass, predict2, threshold = optCutOff2)
##   0 1
## 0 6 3
## 1 1 0